scRNAseq: Bioconductor

DropletUtils: Bioconductor

scater: Bioconductor, GitHub, Paper

ggbeeswarm: CRAN, GitHub

Demo Dataset: BachMammaryData from Differentiation dynamics of mammary epithelial cells revealed by single-cell RNA sequencing Nat Commun. 2017;8(1):2128.

License: GPL-3.0

Start R & prepare environment

Start R

cd /ngs/scRNA-seq-Analysis-Demo

R

Load R libraries

suppressPackageStartupMessages({
    library(scRNAseq)
    library(scater)
    library(DropletUtils)
    library(ggbeeswarm) # geom_quasirandom
})

Set up colour palettes

c30 <- c("#1C86EE", #1 dodgerblue2
         "#FF0000", #2 red1
         "#008B00", #3 green4
         "#FF7F00", #4 DarkOrange1
         "#00FF00", #5 green1
         "#A020F0", #6 purple
         "#0000FF", #7 blue1
         "#FF1493", #8 DeepPink1
         "#8B4500", #9 DarkOrange4
         "#000000", #10 black
         "#FFD700", #11 gold1
         "#00CED1", #12 DarkTurquoise
         "#68228B", #13 DarkOrchid4
         "#FF83FA", #14 orchid1
         "#B3B3B3", #15 gray70
         "#B03060", #16 maroon
         "#7CCD7C", #17 PaleGreen3
         "#333333", #18 gray20
         "#D8BFD8", #19 thistle
         "#FFC125", #20 goldenrod1
         "#EEE685", #21 khaki2
         "#7EC0EE", #22 SkyBlue2
         "#36648B", #23 SteelBlue4
         "#54FF9F", #24 SeaGreen1
         "#8B8B00", #25 yellow4
         "#CDCD00", #26 yellow3
         "#F08080", #27 LightCoral
         "#A52A2A", #28 brown
         "#00008B", #29 blue4
         "#CD2626"  #30 firebrick3
)

pie(rep(1,30), col = c30, radius = 1)

Choose sample & cluster colours

# Choosing colours for samples, n = 8
sample_col <- c30[c(1:8)]

# Choosing colours for clutsers, n = 15
cluster_col <- c30[c(1:15)]

# Choosing colours for Venn diagram, n = 3
circle.col <- c30[c(23,24,27)]

Load scRNA-seq data

Import public scRNA-seq data

In this example, we will use the example data set from the scRNAseq Bioconductor package. It contains expression matrices for several public scRNA-seq datasets in the form of SingleCellExperiment objects. The BachMammaryData function will download and import the mouse mammary gland single-cell RNA-seq data obtained with the 10x Genomics Chromium platform from Bach et al. (2017). The object contains 25,806 barcodes, cell annotations that includes the sample ID and condition, and the gene annotations that includes the Ensembl gene ID and gene symbol.

sce <- BachMammaryData()
sce
## class: SingleCellExperiment 
## dim: 27998 25806 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames: NULL
## colData names(3): Barcode Sample Condition
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
## [1] "Number of genes: 27998"
paste0("Number of cells: ", ncol(sce))
## [1] "Number of cells: 25806"
# Cell info
str(colData(sce))
## Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   ..@ rownames       : NULL
##   ..@ nrows          : int 25806
##   ..@ listData       :List of 3
##   .. ..$ Barcode  : chr [1:25806] "AAACCTGAGGCCATAG-1" "AAACCTGCATCGGGTC-1" "AAACGGGTCAAACGGG-1" "AAAGATGAGATAGCAT-1" ...
##   .. ..$ Sample   : chr [1:25806] "NP_1" "NP_1" "NP_1" "NP_1" ...
##   .. ..$ Condition: Named chr [1:25806] "Nulliparous" "Nulliparous" "Nulliparous" "Nulliparous" ...
##   .. .. ..- attr(*, "names")= chr [1:25806] "NP" "NP" "NP" "NP" ...
##   ..@ elementType    : chr "ANY"
##   ..@ elementMetadata: NULL
##   ..@ metadata       : list()
# Gene info
str(rowData(sce))
## Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   ..@ rownames       : NULL
##   ..@ nrows          : int 27998
##   ..@ listData       :List of 2
##   .. ..$ Ensembl: chr [1:27998] "ENSMUSG00000051951" "ENSMUSG00000089699" "ENSMUSG00000102343" "ENSMUSG00000025900" ...
##   .. ..$ Symbol : chr [1:27998] "Xkr4" "Gm1992" "Gm37381" "Rp1" ...
##   ..@ elementType    : chr "ANY"
##   ..@ elementMetadata: NULL
##   ..@ metadata       : list()

Import Cell Ranger data

Note: Skip if using the BachMammaryData dataset.

# Define sample ID
sample_id <- c("Sample1", "Sample2", "Sample3", "Sample4")

To import scRNA-seq data generated by Cell Ranger, we can using the read10xCounts function from the DropletUtils package, which will produce a SingleCellExperiment object containing count data for each gene (row) and cell (column) across all samples.

Import 10X data

A. From matrix files

Edit the script below to point cr_mat_path to the directory containing CellRanger output files: barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz.

# Define data location
cr_mat_path <- "path-to-cellranger-output-folder" # a folder

# Check if these files exist: barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz
check_matrix_input <- function(cr_mat_path) {
    error <- FALSE
    if(! file.exists(paste0(cr_mat_path,"/barcodes.tsv.gz"))) {
            error <- TRUE
            print("'barcodes.tsv.gz' not found")
    }

    if(! file.exists(paste0(cr_mat_path,"/features.tsv.gz"))) {
            error <- TRUE
            print("'features.tsv.gz' not found")
    }

    if(! file.exists(paste0(cr_mat_path,"/matrix.mtx.gz"))) {
            error <- TRUE
            print("'matrix.mtx.gz' not found")
    } 

    if(isTRUE(error)) {
            stop("Stopped!")
    } else {
            print("All files found.")
    }       
}

check_matrix_input(cr_mat_path)

# Import data
sce <- DropletUtils::read10xCounts(cr_mat_path, sample.names = sample_id)
sce

# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
paste0("Number of cells: ", ncol(sce))

B. From HDF5 files

Edit the script below to point cr_h5_path to the path of the filtered_feature_bc_matrix.h5 file.

# Define data location
cr_h5_file <- "path-to-cellranger-output-folder/filtered_feature_bc_matrix.h5" # a h5 file

# Check if this files exist
file.exists(cr_h5_file)

# Import data
sce <- DropletUtils::read10xCounts(cr_h5_file, sample.names = sample_id)
sce

# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
paste0("Number of cells: ", ncol(sce))

Add cell and gene annotations

Note: Skip if using the BachMammaryData dataset.

colnames(sce) <- colData(sce)$Barcode
rownames(sce) <- rowData(sce)$Symbol

# Cell info
str(colData(sce))

# Gene info
str(rowData(sce))

Generate QC metrics

Define mitochondrial genes

We use grep to perform pattern matching to look for genes that are from the mitochondrial genome (chrM). The regular expression "^mt-|^MT-" will work for both human (MT-) and mouse (mt-) mitochondrial genomes. The ^ anchor is to ensure the pattern is matched to the beginning of the gene symbol.

# The subset feature can be supplied as character vector of feature names, a logical vector,
# or a numeric vector of indices
#is.mito <- grepl("^mt-|^MT-", rowData(sce)$Symbol) # a logical vector
is.mito <- grep("^mt-|^MT-", rowData(sce)$Symbol)   # numeric vector of indices

# Print matched genes
rowData(sce)$Symbol[is.mito]
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3"  "mt-Nd3" 
##  [9] "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"

Add QC metrics

With an older version of scater, we would use the calculateQCMetrics function to add QC metrics, but it is now deprecated. We will instead use addPerCellQC and addPerFeatureQC and add additional metrics to colData and rowData instead.

Add per-cell QC

Note: A pseudocount of 1 is added to avoid undefined values after the log-transformation.

# "sum" - sum of counts for the cell (library size)
# "detected" - number of genes for the cell that have counts above the detection limit (default 0)
sce <- addPerCellQC(sce, list(MT = is.mito))

# Add additional stats to per-cell QC
pseudocount = 1
colData(sce)$log10_sum <- log10(colData(sce)$sum + pseudocount)
colData(sce)$log10_detected <- log10(colData(sce)$detected + pseudocount)
colData(sce)$log10_genes_per_umi <- colData(sce)$log10_sum / colData(sce)$log10_detected

Add per-feature QC

# "mean" - mean counts for each gene across all cells
# "detected" - percentage of expressing cells, i.e.cells with non-zero counts for each gene
sce <- addPerFeatureQC(sce)
sce
## class: SingleCellExperiment 
## dim: 27998 25806 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(4): Ensembl Symbol mean detected
## colnames: NULL
## colData names(16): Barcode Sample ... log10_detected log10_genes_per_umi
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# Cell data
names(colData(sce))
##  [1] "Barcode"             "Sample"              "Condition"          
##  [4] "sum"                 "detected"            "percent_top_50"     
##  [7] "percent_top_100"     "percent_top_200"     "percent_top_500"    
## [10] "subsets_MT_sum"      "subsets_MT_detected" "subsets_MT_percent" 
## [13] "total"               "log10_sum"           "log10_detected"     
## [16] "log10_genes_per_umi"
colData(sce)
## DataFrame with 25806 rows and 16 columns
##                  Barcode      Sample       Condition       sum  detected   percent_top_50
##              <character> <character>     <character> <numeric> <integer>        <numeric>
## 1     AAACCTGAGGCCATAG-1        NP_1     Nulliparous     13152      3729 23.0383211678832
## 2     AAACCTGCATCGGGTC-1        NP_1     Nulliparous     12147      3301 27.6446859306825
## 3     AAACGGGTCAAACGGG-1        NP_1     Nulliparous     12095      3227 28.8879702356346
## 4     AAAGATGAGATAGCAT-1        NP_1     Nulliparous     12879      3771 21.3215311747806
## 5     AAAGATGAGATATGCA-1        NP_1     Nulliparous     12312      3627 26.9818063677713
## ...                  ...         ...             ...       ...       ...              ...
## 25802 TTTGTCAGTGATAAGT-1        PI_2 Post-involution     13343      3386 28.9140373229409
## 25803 TTTGTCAGTGTAACGG-1        PI_2 Post-involution      3073      1345 29.6452977546372
## 25804 TTTGTCAGTGTTTGGT-1        PI_2 Post-involution     10834      3009 30.0812257707218
## 25805 TTTGTCATCTTGACGA-1        PI_2 Post-involution      6820      2293 28.2991202346041
## 25806 TTTGTCATCTTTAGGG-1        PI_2 Post-involution      4747      1950  31.303981461976
##        percent_top_100  percent_top_200  percent_top_500 subsets_MT_sum
##              <numeric>        <numeric>        <numeric>      <numeric>
## 1     34.7475669099757 45.7877128953771 60.4090632603406            112
## 2     39.9110891578167 51.9305178233309 65.7034658763481            102
## 3     42.6953286482017 54.2290202563043 67.6395204630012            113
## 4     32.5335817998292  44.133861324637 58.9642052954422            105
## 5     38.9538661468486  50.219298245614 63.1172839506173            141
## ...                ...              ...              ...            ...
## 25802 40.8978490594319 52.1022258862325 66.1395488271003             76
## 25803 41.7832736739343 54.6046208916368   72.50244061178              7
## 25804 42.9204356654975 54.2274321580211 67.7496769429574             87
## 25805 39.3988269794721 50.7624633431085 66.8181818181818             41
## 25806 42.3846639983147 53.1072256161786 67.7269854645039             51
##       subsets_MT_detected subsets_MT_percent     total        log10_sum   log10_detected
##                 <integer>          <numeric> <numeric>        <numeric>        <numeric>
## 1                       7  0.851581508515815     13152 4.11902482011478 3.57170883180869
## 2                       6  0.839713509508521     12147 4.08450478324623 3.51877706892677
## 3                       6  0.934270359652749     12095 4.08264177815713 3.50893352605003
## 4                       5  0.815280689494526     12879 4.10991586302379 3.57657168406529
## 5                       6   1.14522417153996     12312 4.09036387947172 3.55966727838806
## ...                   ...                ...       ...              ...              ...
## 25802                   7  0.569587049389193     13343 4.12528603329366 3.52981519664463
## 25803                   2  0.227790432801822      3073 3.48770386316373 3.12904505988796
## 25804                   7  0.803027505999631     10834 4.03482891565584 3.47856649559384
## 25805                   5  0.601173020527859      6820 3.83384804953115 3.36059341356525
## 25806                   6   1.07436275542448      4747 3.67651071028255 3.29025726939452
##       log10_genes_per_umi
##                 <numeric>
## 1        1.15323645181596
## 2        1.16077395732603
## 3        1.16349932190164
## 4        1.14912162430148
## 5        1.14908601270284
## ...                   ...
## 25802     1.1686974539673
## 25803    1.11462244755549
## 25804    1.15991139475603
## 25805    1.14082472281698
## 25806    1.11739308183616
# Gene data
names(rowData(sce))
## [1] "Ensembl"  "Symbol"   "mean"     "detected"
rowData(sce)
## DataFrame with 27998 rows and 4 columns
##                  Ensembl         Symbol                 mean           detected
##              <character>    <character>            <numeric>          <numeric>
## 1     ENSMUSG00000051951           Xkr4   0.0215066263659614   1.98403472060761
## 2     ENSMUSG00000089699         Gm1992    0.016662791598853   1.53840192203364
## 3     ENSMUSG00000102343        Gm37381                    0                  0
## 4     ENSMUSG00000025900            Rp1 0.000542509493916143 0.0542509493916144
## 5     ENSMUSG00000109048            Rp1                    0                  0
## ...                  ...            ...                  ...                ...
## 27994 ENSMUSG00000079808     AC168977.1  0.00213128729752771  0.205378594125397
## 27995 ENSMUSG00000095041           PISD    0.706347361078819   38.8436797643959
## 27996 ENSMUSG00000063897          DHRSX     0.34290475083314    25.668449197861
## 27997 ENSMUSG00000096730       Vmn2r122                    0                  0
## 27998 ENSMUSG00000095742 CAAA01147332.1   0.0175928078741378   1.73990544834535

Identify low-quality cells

In most cases, We can assume most of the cells are high quality and use the median absolute deviation (MAD) from the median approach to identify cells that are outliers, presumably representing low-quality cells. The threshold used here to determined if a cell is an outlier is if it is more than 3 MADs from the median, under a normal distribution this cutoff will retain 99% of cells. The log-transformed values will be applied to the input values (with log = TRUE) as this improves resolution at small values for distribution exhibit a heavy right tail, and avoid inflation of the MAD that might compromise outlier detection on the left tail.

Note: One should be aware of factors that could affect the distribution, for example certain cells can have a high metabolic rate and thus higher mitochondrial gene expression, or certain cells can express very few genes. In such cases, one might need to use fixed thresholds to filter low-quality cells.

qc.lib <- isOutlier(colData(sce)$sum, nmads = 3, log = TRUE, type = "lower")
qc.expr <- isOutlier(colData(sce)$detected, nmads = 3, log = TRUE, type = "lower")
qc.mito <- isOutlier(colData(sce)$subsets_MT_percent, nmads = 3, type = "higher")

View the filter thresholds and determine if they are appropriate.

attr(qc.lib, "thresholds")  # values lower than the "lower" threshold would be filtered
##    lower   higher 
## 924.5199      Inf
attr(qc.expr, "thresholds") # values lower than the "lower" threshold would be filtered
##    lower   higher 
## 582.1861      Inf
attr(qc.mito, "thresholds") # values higher than the "higher" threshold would be filtered
##    lower   higher 
##     -Inf 1.632992

Note: The qc.mito threshold is very low, i.e. it will remove cells with mitochondrial proportions greater than ~1.63%. Usually cells with less than 10% are considered good quality and more than 20% are of poor quality. In this case, we will manually set a fixed threshold of 10%.

qc.mito.threshold = 10 # set at 10%
qc.mito <- colData(sce)$subsets_MT_percent > qc.mito.threshold
attr(qc.mito, "thresholds")["lower"] <- -Inf
attr(qc.mito, "thresholds")["higher"] <- qc.mito.threshold

attr(qc.mito, "thresholds")
##  lower higher 
##   -Inf     10

Summarize the number of cells removed for each reason. In this example, all cells passed the thresholds and none are considered an outlier.

discard <- qc.lib | qc.expr | qc.mito
colData(sce)$discard <- discard # Add a column in colData to store QC filter result
DataFrame(LibSize = sum(qc.lib), ExprGene = sum(qc.expr), MitoProp = sum(qc.mito), 
      Total = sum(discard))
## DataFrame with 1 row and 4 columns
##     LibSize  ExprGene  MitoProp     Total
##   <integer> <integer> <integer> <integer>
## 1         0         0         0         0
# Show as Venn diagram
limma::vennDiagram(data.frame(LibSize = qc.lib, ExprGene = qc.expr, MitoProp = qc.mito),
           circle.col = circle.col)

Assess QC metrics on cells

Now we are ready to inspect the distributions of various metrics and the thresholds chosen earlier. In an ideal case, we will see these metrics follow normal distributions and thus would justify the 3 MAD thresholds used in outlier detection. Afer assessing the plots, we can decide if the thresholds need adjustment to account for specific biological states or subset of cells, etc.

Cell counts

The cell counts are determined by the number of unique cellular barcodes detected. Ideally, the number of unique cellular barcodes will correpsond to the number of cells loaded. However the capture rates or capture efficiency of cells will affect the eventual cell counts. Accurate measure the input cell concentration is also important in determining the cell capture efficiency. Lastly, it is also possible to detect cell numbers that are much higher than what we loaded due to the experimental procedure. For example, there is a chance of obtaining only a barcoded bead in the emulsion droplet (GEM) and no actual cell with the 10X protocol.

Note: What were the expected cell counts in samples?

table(colData(sce)$Sample)
## 
##  G_1  G_2  L_1  L_2 NP_1 NP_2 PI_1 PI_2 
## 2915 3106 5906 3697 2249 2127 1500 4306
ggplot(as.data.frame(colData(sce)), aes(Sample, fill = Sample)) + geom_bar(color = "black") + 
    geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 5) +
    scale_fill_manual(values = sample_col) + theme_classic(base_size = 12) + 
    theme(legend.position = "none") + ylab("Counts") + ggtitle("Cell counts")

Library size

Next, we will consider the total number of RNA molecules detected per cell. The unique molecular identifier (UMI) counts should generally be above 500 (ref). Wells with few transcripts are likely to have been broken or failed to capture a cell, and should thus be removed. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply (ref).

View library size distributions with quantile.

Note: Are there any cells with low total UMI counts?

quantile(colData(sce)$sum, seq(0, 1, 0.1))  # 0% - 100% percentile
##       0%      10%      20%      30%      40%      50%      60%      70%      80%      90% 
##   1724.0   2880.0   3704.0   4576.5   5359.0   6285.5   7434.0   8961.5  11088.0  15572.0 
##     100% 
## 116555.0
quantile(colData(sce)$sum, seq(0.9, 1, 0.1))    # 90% - 100% percentile (high read-depth)
##    90%   100% 
##  15572 116555

Visualise library size distributions with ggplot.

logbreak = scales::trans_breaks("log10", function(x) 10^x)
loglab = scales::trans_format("log10", scales::math_format(10^.x))

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(sum, color = Sample, fill = Sample)) +
    geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
    scale_x_log10(breaks = logbreak, labels = loglab) +
    geom_vline(xintercept = attr(qc.lib, "thresholds")[1], linetype = 2) + # show cutoff
    scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
    theme_classic(base_size = 12) + xlab("Total UMI counts") + 
    ylab("Frequency") + ggtitle("Histogram of Library Size per Cell")

# violin plot
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, sum, colour = discard)) +
    geom_violin(width = 1) + scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
    geom_hline(yintercept = attr(qc.lib, "thresholds")[1], linetype = 2) + # show cutoff
    scale_color_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
    guides(color = guide_legend("Discard", override.aes = list(size = 1, alpha = 1))) +
    theme(legend.position = "right") + ylab("Total UMI counts") + 
    ggtitle("Violin plot of Library Size perl Cell")

multiplot(p1, p2)

Expressed features

Aside from having sufficient sequencing depth for each sample, we also expected to see reads distributed across the transcriptome. When visualised the expressed features (i.e. genes) in all the cells as a histogram or density plot, the plot should contain a single large peak for high quality data.

View expressed features distributions with quantile.

quantile(colData(sce)$detected, seq(0, 1, 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  604 1223 1473 1731 1948 2167 2407 2710 3122 3786 8795

Visualise expressed features distributions with ggplot.

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(detected, color = Sample, fill = Sample)) +
    geom_histogram(position = "identity", alpha = 0.4, bins = 50) + 
    scale_x_log10(breaks = logbreak, labels = loglab) +
    geom_vline(xintercept = attr(qc.expr, "thresholds")[1], linetype = 2) + # show cutoff
    scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
    theme_classic(base_size = 12) + xlab("Total expressed genes") + 
    ylab("Frequency") + ggtitle("Histogram of Expressed Features per Cell")

# violin
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, detected, colour = discard)) +
    geom_violin(width = 1) + scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
    geom_hline(yintercept = attr(qc.expr, "thresholds")[1], linetype = 2) + # show cutoff
    scale_colour_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
    guides(color = guide_legend("Discard", override.aes = list(size = 1, alpha = 1))) +
    theme(legend.position = "right") + ylab("Total expressed genes") + 
    ggtitle("Violin plot of Expressed Features per Cell")

multiplot(p1, p2)

Complexity of RNA species

The UMI count per cell and the number of genes detected per cell are often evaluated together. These two indices are usually strongly related, i.e. the higher UMI count for a cell, the more genes are detected as well. Cells that have a less complex RNA species (low number of genes detected per UMI), such as red blood cells, can often be detected by this metric. Generally, we expect the complexity score to be above 0.80 ref.

View complexity distributions with quantile.

quantile(colData(sce)$log10_genes_per_umi, seq(0, 1, 0.1))
##       0%      10%      20%      30%      40%      50%      60%      70%      80%      90% 
## 1.065505 1.114984 1.122271 1.128256 1.133894 1.139502 1.145985 1.153267 1.162562 1.176168 
##     100% 
## 1.350996

Visualise complexity distributions with ggplot.

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(log10_genes_per_umi, color = Sample,
                                              fill = Sample)) +
        geom_histogram(position = "identity", alpha = 0.4, bins = 50) + 
    geom_vline(xintercept = 0.8, linetype = 2) +
        scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
        theme_classic(base_size = 12) + xlab("log10 Gene per UMI") + ylab("Frequency") +
        ggtitle("Histogram of Complexity of Gene Expression")

# scatter plot
p2 <- ggplot(as.data.frame(colData(sce)), aes(sum, detected, color = Sample)) + 
    scale_x_log10(breaks = logbreak, labels = loglab) +
    scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_point(size = 0.6, alpha = 0.3) + facet_wrap(~ as.data.frame(colData(sce))$Sample) + 
    scale_color_manual(values = sample_col) + theme_classic(base_size = 12) +
    guides(color = guide_legend("Sample", override.aes = list(size = 3, alpha = 1))) +
    xlab("Total UMI counts") + ylab("Total expressed genes") + 
    ggtitle("UMIs vs. Expressed Genes")

multiplot(p1, p2)

Mitochondrial contamination

The mitochondrial proportions in cells is an useful indicator of cell quality. High proportion of counts assigned to mitochondrial genes is an indication of damaged, dying and dead cells, whereby cytoplasmic mRNA has leaked out through a broken membrane, hence only the mRNA located in the mitochondria is preserved and being sequenced.

View distributions with quantile.

Note: Are there any cells with high expression of mitochondrial genes (>20% of total counts in a cell)?

quantile(colData(sce)$subsets_MT_percent, seq(0, 1, 0.1))
##        0%       10%       20%       30%       40%       50%       60%       70%       80% 
## 0.0000000 0.4207468 0.5272408 0.6069388 0.6817726 0.7510203 0.8267616 0.9212001 1.0394179 
##       90%      100% 
## 1.2505218 8.2174054

Visualise distributions with ggplot.

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(x = subsets_MT_percent, color = Sample, 
                           fill = Sample)) +
    geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
    geom_vline(xintercept = attr(qc.mito, "thresholds")[2], linetype = 2) + # show cutoff
    scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
    theme_classic(base_size = 12) + xlab("% counts from mitochondrial genes") + 
    ylab("Frequency") + ggtitle("Histogram of Mitochondrial Proportions per Cell")

# violin
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, subsets_MT_percent, colour = discard)) +
        geom_violin(width = 1) + geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
        geom_hline(yintercept = attr(qc.mito, "thresholds")[2], linetype = 2) + # show cutoff
        scale_colour_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
    guides(color = guide_legend("Discard", override.aes = list(size = 1, alpha = 1))) +
        theme(legend.position = "right") + ylab("% counts from mitochondrial genes") +
        ggtitle("Violin plot of Mitochondrial Proportions per Cell")

multiplot(p1, p2)

Cell filtering

After reviewing the diagnostic plots, we will proceed with removing low-quality cells from sce with the thresholds that were selected in Identify low-quality cells. You can also change the thresholds that is suitable for your study to prevent removing biologically relevant cells.

Remove outlier cells

In this example, none of the cells are outliers, and hence none were removed.

sce.filt <- sce[, !discard]
sce.filt
## class: SingleCellExperiment 
## dim: 27998 25806 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(4): Ensembl Symbol mean detected
## colnames: NULL
## colData names(17): Barcode Sample ... log10_genes_per_umi discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

Update per-feature QC

Because poor-quality cells were removed, we will update the per-feature QC metrics whereby some calculations uses cell data.

rowData(sce.filt) <- cbind(rowData(sce.filt)[,c("Ensembl", "Symbol")], 
               perFeatureQCMetrics(sce.filt))

# Add additional stats to per-feature QC
rowData(sce.filt)$n_cells <- rowData(sce.filt)$detected/100 * ncol(sce.filt) # number of expressing cells
rowData(sce.filt)$pct_dropout <- 100 - rowData(sce.filt)$detected # percentage of cells with zero counts
rowData(sce.filt)$total_counts <- rowData(sce.filt)$mean * ncol(sce.filt)

Assess QC metrics on features

Show number of genes that are not expressed in any cell.

not.expressed <- rowData(sce.filt)$n_cells == 0
table(not.expressed)
## not.expressed
## FALSE  TRUE 
## 21140  6858

We remove genes that are not expressed in any cell

sce.filt <- sce.filt[!not.expressed,]
sce.filt
## class: SingleCellExperiment 
## dim: 21140 25806 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(7): Ensembl Symbol ... pct_dropout total_counts
## colnames: NULL
## colData names(17): Barcode Sample ... log10_genes_per_umi discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

Aside from the 6858 genes that are not expressed in any cell, there are also genes that have extremely low average expression across all cells. We can also observe a positive relationship between the number of of expressing cells versus mean counts of all the genes in the expression matrix.

# histogram
p1 <- ggplot(as.data.frame(rowData(sce.filt)), aes(x = mean)) +
    geom_histogram(position = "identity", bins = 100, fill = "white", color = "black") +
    scale_x_log10(breaks = logbreak, labels = loglab) +
    geom_vline(xintercept = 0.005, linetype = 2, color = "cyan") +  # mean count = 0.005
    geom_vline(xintercept = 0.010, linetype = 2, color = "blue") +  # mean count = 0.010
    geom_vline(xintercept = 0.020, linetype = 2, color = "red") +   # mean count = 0.020
    theme_classic(base_size = 12) + xlab("Mean counts") +
    ylab("Frequency") + ggtitle("Histogram of Mean Counts")

# scatter plot
p2 <- ggplot(as.data.frame(rowData(sce.filt)), aes(n_cells, mean, color = pct_dropout)) + 
    geom_point(size = 0.6, alpha = 0.3) + theme_classic(base_size = 12) + 
    scale_x_log10(breaks = logbreak, labels = loglab) + 
    scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_hline(yintercept = 0.005, linetype = 2, color = "cyan") +
    geom_hline(yintercept = 0.010, linetype = 2, color = "blue") +
    geom_hline(yintercept = 0.020, linetype = 2, color = "red") +
    guides(color = guide_legend("Pct Dropouts", override.aes = list(size = 3, alpha = 1))) + 
    xlab("Number of expressing cells") + ylab("Mean counts") + 
    ggtitle("Expressing Cells vs. Mean Counts")

multiplot(p1, p2)

Feature filtering

Compare filtering methods

After assessing the QC metrics on features, we can proceeed to remove genes that is considered “undetectable”. This can be done by define a gene as detectable if the average expression is above a certain threshold, or a N number of cells expressing X number of the transcript.

# Average expression more than 0.02
qc.gene1 <- rowData(sce.filt)$mean > 0.02

# 2 transcript detected in at least 0.1% of total cells
ncell <- ncol(sce.filt) * 0.001
ncell
## [1] 25.806
qc.gene2 <- nexprs(sce.filt, byrow = TRUE, detection_limit = 2) >= ncell

x = data.frame(n_cells = rowData(sce.filt)$n_cells, mean = rowData(sce.filt)$mean, 
           qc1 = qc.gene1, qc2 = qc.gene2)
x$cond = paste0(x$qc1, x$qc2)
x$cond = as.factor(x$cond)
levels(x$cond) = c("Failed (Both)", "Failed (mean)", "Failed (nexprs)", "Pass")

table(x$cond)
## 
##   Failed (Both)   Failed (mean) Failed (nexprs)            Pass 
##            9605              82            3191            8262
# Show as Venn diagram
limma::vennDiagram(data.frame(MeanExpr = qc.gene1, kOverA = qc.gene2),
                   circle.col = c("blue", "red"), main = "Detectable Genes")

# scatter plots
p3 <- ggplot(x, aes(n_cells, mean, color = cond)) + 
    geom_point(size = 0.6, alpha = 0.3) + theme_classic(base_size = 12) +
        scale_x_log10(breaks = logbreak, labels = loglab) +
        scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_hline(yintercept = 0.02, linetype = 2, color = "blue") + # mean count = 0.005
        guides(color = guide_legend("Feature Status", override.aes = list(size = 3, alpha = 1))) +
        xlab("Number of expressing cells") + ylab("Mean counts") +
        ggtitle("Expressing Cells vs. Mean Counts")

p4 <- ggplot(x, aes(n_cells, mean, color = cond)) + geom_point(size = 0.6, alpha = 0.3) + 
    facet_wrap(~ cond) + theme_classic(base_size = 12) + 
    scale_x_log10(breaks = logbreak, labels = loglab) +
    scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_hline(yintercept = 0.02, linetype = 2, color = "blue") + # mean count = 0.005
    guides(color = guide_legend("Feature Status", override.aes = list(size = 3, alpha = 1))) +
    xlab("Number of expressing cells") + ylab("Mean counts") +
    ggtitle("Expressing Cells vs. Mean Counts")

multiplot(p3, p4)

Remove lowly-expressed genes

Here we will use the classical kOverA approach to select genes that can be detected (2 transcripts) in at least 0.1% of total cells.

sce.filt <- sce.filt[qc.gene2,]
sce.filt
## class: SingleCellExperiment 
## dim: 8344 25806 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(7): Ensembl Symbol ... pct_dropout total_counts
## colnames: NULL
## colData names(17): Barcode Sample ... log10_genes_per_umi discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

Update per-cell QC

Because lowly-expressed genes were removed, we will update the per-cell QC metrics whereby some calculations uses gene data.

colData(sce.filt) <- cbind(colData(sce.filt)[,c("Barcode","Sample","Condition")], 
               perCellQCMetrics(sce.filt, list(MT = grep("^mt-|^MT-", rowData(sce.filt)$Symbol))))

Add average expression

Use the calculateAverage function to average counts per feature after normalizing with size factors.

rowData(sce.filt)$ave_counts <- calculateAverage(sce.filt)

r.squared = summary(lm(mean ~ ave_counts, rowData(sce.filt)))$r.squared
r.squared = as.expression(bquote(R^2 == .(round(r.squared, 4))))

ggplot(as.data.frame(rowData(sce.filt)), aes(mean, ave_counts)) + 
    geom_point(size = 0.6, alpha = 0.3) + theme_classic(base_size = 12) + 
    scale_x_log10(breaks = logbreak, labels = loglab) + 
    scale_y_log10(breaks = logbreak, labels = loglab) + 
    geom_smooth(method = "lm", se = FALSE) + 
    annotate("text", label = r.squared, x = 0.1, y = 100, size = 4) +
    xlab("Mean counts") + ylab("Size-adjusted average count") + 
    ggtitle("Mean Counts vs. Size-adjusted Average Count")

Add log2 counts to sce.filt

In addition to the count data in assays, we will also add the log2-transformed counts to assays.

# Accessing the assay data
dim(assay(sce.filt, "counts"))
## [1]  8344 25806

log2 raw counts

assay(sce.filt, "logcounts") <- log2(counts(sce.filt) + pseudocount)

log2 count-per-million (CPM)

The effective library sizes are used as the denominator of the CPM calculation.

# More about lib.sizes calculation in calculateCPM()
# x is a numeric matrix of counts
x <- assay(sce, "counts")
size.factors <- colSums(x)/mean(colSums(x))     # same as scater::librarySizeFactors(x)
lib.sizes <- colSums(x) / 1e6                   # count-per-million
lib.sizes <- size.factors / mean(size.factors) * mean(lib.sizes) # normalisation
assay(sce.filt, "logcpm") <- log2(calculateCPM(sce.filt) + pseudocount)
# Assay info
str(assays(sce.filt))
## Formal class 'SimpleList' [package "S4Vectors"] with 4 slots
##   ..@ listData       :List of 3
##   .. ..$ counts   :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. .. .. ..@ i       : int [1:56501178] 5 7 8 9 11 12 13 16 17 21 ...
##   .. .. .. ..@ p       : int [1:25807] 0 3424 6458 9443 12904 16161 17983 20490 23122 25199 ...
##   .. .. .. ..@ Dim     : int [1:2] 8344 25806
##   .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. ..$ : NULL
##   .. .. .. .. ..$ : NULL
##   .. .. .. ..@ x       : num [1:56501178] 1 2 6 1 1 4 1 1 1 1 ...
##   .. .. .. ..@ factors : list()
##   .. ..$ logcounts:Formal class 'dgeMatrix' [package "Matrix"] with 4 slots
##   .. .. .. ..@ x       : num [1:215325264] 0 0 0 0 0 ...
##   .. .. .. ..@ Dim     : int [1:2] 8344 25806
##   .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. ..$ : NULL
##   .. .. .. .. ..$ : NULL
##   .. .. .. ..@ factors : list()
##   .. ..$ logcpm   :Formal class 'dgeMatrix' [package "Matrix"] with 4 slots
##   .. .. .. ..@ x       : num [1:215325264] 0 0 0 0 0 ...
##   .. .. .. ..@ Dim     : int [1:2] 8344 25806
##   .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. ..$ : NULL
##   .. .. .. .. ..$ : NULL
##   .. .. .. ..@ factors : list()
##   ..@ elementType    : chr "ANY"
##   ..@ elementMetadata: NULL
##   ..@ metadata       : list()

Save data

save.image("BachMammary.RData")

Session information

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/jupyterlab/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods  
## [9] base     
## 
## other attached packages:
##  [1] ggbeeswarm_0.6.0            DropletUtils_1.6.1          scater_1.14.6              
##  [4] ggplot2_3.3.2               scRNAseq_2.0.2              SingleCellExperiment_1.8.0 
##  [7] SummarizedExperiment_1.16.1 DelayedArray_0.12.3         BiocParallel_1.20.1        
## [10] matrixStats_0.56.0          Biobase_2.46.0              GenomicRanges_1.38.0       
## [13] GenomeInfoDb_1.22.1         IRanges_2.20.2              S4Vectors_0.24.4           
## [16] BiocGenerics_0.32.0         knitr_1.29                 
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-149                  bitops_1.0-6                 
##  [3] bit64_4.0.5                   httr_1.4.2                   
##  [5] tools_3.6.3                   R6_2.4.1                     
##  [7] irlba_2.3.3                   HDF5Array_1.14.4             
##  [9] vipor_0.4.5                   mgcv_1.8-33                  
## [11] DBI_1.1.0                     colorspace_1.4-1             
## [13] withr_2.2.0                   tidyselect_1.1.0             
## [15] gridExtra_2.3                 bit_4.0.4                    
## [17] curl_4.3                      compiler_3.6.3               
## [19] BiocNeighbors_1.4.2           labeling_0.3                 
## [21] scales_1.1.1                  rappdirs_0.3.1               
## [23] stringr_1.4.0                 digest_0.6.25                
## [25] rmarkdown_2.3                 R.utils_2.10.1               
## [27] XVector_0.26.0                pkgconfig_2.0.3              
## [29] htmltools_0.5.0               limma_3.42.2                 
## [31] dbplyr_1.4.4                  fastmap_1.0.1                
## [33] rlang_0.4.7                   RSQLite_2.2.0                
## [35] shiny_1.5.0                   DelayedMatrixStats_1.8.0     
## [37] farver_2.0.3                  generics_0.0.2               
## [39] R.oo_1.24.0                   dplyr_1.0.2                  
## [41] RCurl_1.98-1.2                magrittr_1.5                 
## [43] BiocSingular_1.2.2            GenomeInfoDbData_1.2.2       
## [45] Matrix_1.2-18                 Rcpp_1.0.5                   
## [47] munsell_0.5.0                 Rhdf5lib_1.8.0               
## [49] viridis_0.5.1                 lifecycle_0.2.0              
## [51] R.methodsS3_1.8.1             edgeR_3.28.1                 
## [53] stringi_1.4.6                 yaml_2.2.1                   
## [55] zlibbioc_1.32.0               rhdf5_2.30.1                 
## [57] BiocFileCache_1.10.2          AnnotationHub_2.18.0         
## [59] grid_3.6.3                    blob_1.2.1                   
## [61] dqrng_0.2.1                   promises_1.1.1               
## [63] ExperimentHub_1.12.0          crayon_1.3.4                 
## [65] lattice_0.20-41               splines_3.6.3                
## [67] locfit_1.5-9.4                pillar_1.4.6                 
## [69] glue_1.4.2                    BiocVersion_3.10.1           
## [71] evaluate_0.14                 BiocManager_1.30.10          
## [73] vctrs_0.3.4                   httpuv_1.5.4                 
## [75] gtable_0.3.0                  purrr_0.3.4                  
## [77] assertthat_0.2.1              xfun_0.16                    
## [79] rsvd_1.0.3                    mime_0.9                     
## [81] xtable_1.8-4                  later_1.1.0.1                
## [83] viridisLite_0.3.0             tibble_3.0.3                 
## [85] AnnotationDbi_1.48.0          beeswarm_0.2.3               
## [87] memoise_1.1.0                 ellipsis_0.3.1               
## [89] interactiveDisplayBase_1.24.0